###################################################################################################
# Reducing Model Misspecification and Bias in the Estimation of Interactions ######################
# Matthew Blackwell & Michael Olson ###############################################################
# Direct Primaries and Non-Major Party Voting in the American North and South #####################
###################################################################################################

###################################################################################################
# load required packages, and set seed for replication #####################
###################################################################################################
  
  
# packages
  
  require("foreign")
  require("plyr")
  require("ggplot2")
  require("lfe")
  require("inters")
  require("RColorBrewer")
  require("KRLS")
  require("lmtest")
  require("BART")
  
# seed and number of bootstrap iterations

  set.seed(63116)
  n_boots <- 5000
    
# define a color
    
  skyblue <- rgb(236, 240, 241, max = 255)
  
###################################################################################################
# construct dataset ###############################################################################
###################################################################################################  

# import data for direct primary introduction analysis #
# this is data from ICPSR 6895, "Party Strength in the United States: 1872-1996" #

  primdat <- read.dta("data/06895-0001-Data.dta")

# rename variables and change the state variable to characters #

  primdat <- plyr::rename(primdat,replace=c("GEOCODE"="state","YEAR"="year"))
  
  primdat$state <- as.character(primdat$state)

# because new york is coded separately by candidate and party, keep the party code #
# additionally, drop observations that are for non-state regions #

  primdat[primdat$state=="New York (Party)","state"] <- "New York"
  
  primdat <- primdat[!is.element(primdat$state,c("Northeast Region","Middle West Region",
                                     "South Region","West Region","Republican States (1896-1930)",
                                     "Democratic States (1896-1930)","Competitive States (1896-1930)",
                                     "Nation","New York (Candidate)")),]
  
# make it a dataset for the house  
  
  primdat <- primdat[,c("state","year","DCONG","RCONG")]
  
# create the non-two party vote share
  
  primdat[primdat$DCONG>800,"DCONG"] <- NA; primdat[primdat$RCONG>800,"RCONG"] <- NA # NAs are coded as 999
  primdat$other_share <- round(100-(primdat$DCONG+primdat$RCONG),2)
  primdat[primdat$other_share<0 & !is.na(primdat$other_share),"other_share"] <- 0 # these are cases within 1 of 0 that are just rounding issues

# now we add in the year that primaries were adopted
# we use the years from Hirano and Snyder (2019)
  
  primdat$primary_year <- NA
  
  primdat[primdat$state=="Alabama","primary_year"] <- 1902 # year that state democrats begin using primary
  primdat[primdat$state=="Alaska","primary_year"] <- 1958 # doesn't matter, not in our sample
  primdat[primdat$state=="Arizona","primary_year"] <- 1909 # law in constitution at statehood
  primdat[primdat$state=="Arkansas","primary_year"] <- 1900 # year that state democrats begin using primary
  primdat[primdat$state=="California","primary_year"] <- 1909 # year that law was passed
  primdat[primdat$state=="Colorado","primary_year"] <- 1911 # law passed 1910, no evidence that primary held that year -- no primary data for 1910 but yes for 1912 on SOS website #########################
  primdat[primdat$state=="Connecticut","primary_year"] <- 1955 # doesn't matter, out of sample
  primdat[primdat$state=="Delaware","primary_year"] <- 1969 # doesn't matter, out of sample
  primdat[primdat$state=="Florida","primary_year"] <- 1902 # year that state democrats begin using primary 
  primdat[primdat$state=="Georgia","primary_year"] <- 1898 # year that state democrats begin using primary
  primdat[primdat$state=="Hawaii","primary_year"] <- 1959 # doesn't matter, out of sample
  primdat[primdat$state=="Idaho","primary_year"] <- 1909 # year that law was passed
  primdat[primdat$state=="Illinois","primary_year"] <- 1908 # year that law was passed, primaries held
  primdat[primdat$state=="Indiana","primary_year"] <- 1915 # effective 1916, but that shouldn't matter here (we have even years)
  primdat[primdat$state=="Iowa","primary_year"] <- 1907 # year that law was passed
  primdat[primdat$state=="Kansas","primary_year"] <- 1908 # law was passed in 1908, seems that primaries were held that year (https://www.newspapers.com/image/425751352/?terms=primary%20election%20congress&match=1)
  primdat[primdat$state=="Kentucky","primary_year"] <- 1912 # law was passed in 1912, seems that primaries were held that year (https://www.newspapers.com/image/376074317/?terms=primary%20election%20congress&match=1)
  primdat[primdat$state=="Louisiana","primary_year"] <- 1904 # year that state democrats begin using primary
  primdat[primdat$state=="Maine","primary_year"] <- 1911 # year that law is passed
  primdat[primdat$state=="Maryland","primary_year"] <- 1911 # law passed in 1910, seems as if primaries were held that year (https://www.newspapers.com/image/372412270/?terms=primary%20election%20congress&match=1,
                                                                                                                           # https://www.newspapers.com/image/372417762/?terms=primary%20election%20congress&match=1)
  primdat[primdat$state=="Massachusetts","primary_year"] <- 1911 # year that law is passed
  primdat[primdat$state=="Michigan","primary_year"] <- 1909 # year that law is passed
  primdat[primdat$state=="Minnesota","primary_year"] <- 1901 # year that law is passed
  primdat[primdat$state=="Mississippi","primary_year"] <- 1903 # law passed in 1902, seems that primaries were held that year (https://www.newspapers.com/image/465083688/?terms=primary%20election%20congress&match=1)
  primdat[primdat$state=="Missouri","primary_year"] <- 1907 # year that law is passed
  primdat[primdat$state=="Montana","primary_year"] <- 1913 # law is passed in 1912, no evidence primaries held that year, document says 1914 is first primary ####################
  primdat[primdat$state=="Nebraska","primary_year"] <- 1907 # year that law is passed
  primdat[primdat$state=="Nevada","primary_year"] <- 1909 # year that law is passed; 1915 had indirect primaries -- see below
  primdat[primdat$state=="New Hampshire","primary_year"] <- 1909 # year that law is passed
  primdat[primdat$state=="New Jersey","primary_year"] <- 1911 # year that law is passed
  primdat[primdat$state=="New Mexico","primary_year"] <- 1939 # doesn't matter -- outside sample
  primdat[primdat$state=="New York","primary_year"] <- 1913 # year that law is passed
  primdat[primdat$state=="North Carolina","primary_year"] <- 1915 # year that law is passed
  primdat[primdat$state=="North Dakota","primary_year"] <- 1907 # year that law is passed
  primdat[primdat$state=="Ohio","primary_year"] <- 1913 # year that law is passed
  primdat[primdat$state=="Oklahoma","primary_year"] <- 1908 # law passed in 1908, seems like primaries held that year (https://www.newspapers.com/image/585859231/?terms=primary%20election%20congress&match=1)
  primdat[primdat$state=="Oregon","primary_year"] <- 1906  # law effective in 1906, passed 1904 
  primdat[primdat$state=="Pennsylvania","primary_year"] <- 1907 # year that law is passed
  primdat[primdat$state=="Rhode Island","primary_year"] <- 1947 # doesn't matter - outside sample
  primdat[primdat$state=="South Carolina","primary_year"] <- 1892 # year that state democrats begin using primary 
  primdat[primdat$state=="South Dakota","primary_year"] <- 1907 # year that law is passed
  primdat[primdat$state=="Tennessee","primary_year"] <- 1908 # law passed in 1909, but law struck down; see below
  primdat[primdat$state=="Texas","primary_year"] <- 1905 # year that law is passed
  primdat[primdat$state=="Utah","primary_year"] <- 1937 # doesn't matter -- outside sample
  primdat[primdat$state=="Vermont","primary_year"] <- 1915 # year that law is passed
  primdat[primdat$state=="Virginia","primary_year"] <- 1905  # year that democratic party starts using  for statewide offices but doesn't specify for house
  primdat[primdat$state=="Washington","primary_year"] <- 1907 # year that law is passed
  primdat[primdat$state=="West Virginia","primary_year"] <- 1915 # year that law is passed
  primdat[primdat$state=="Wisconsin","primary_year"] <- 1906 # law passed 1903, effective 1906
  primdat[primdat$state=="Wyoming","primary_year"] <- 1911 # year that law is passed

  primdat$primary_end_year <- 2000
  primdat[primdat$state=="Idaho","primary_end_year"] <- 1919  # idaho resumed after 1931 but this is outside of sample
  
  primdat$cong_treated <- 0
  primdat[primdat$year>=primdat$primary_year & primdat$year<=primdat$primary_end_year,"cong_treated"] <- 1

# exceptions
  
  # according to hirano and snyder, TN passed a mandatory primary law in 1909 that was 
  # struck down by the state supreme court in 1910; but, primaries seem to have been held in 
  # 1908 and 1912, apparently, but no evidence for 1914 and 1916; permanent law then passed in 1917
  
  primdat[(primdat$year==1910 | primdat$year==1914 | primdat$year==1916) & primdat$state=="Tennessee","cong_treated"] <- 0
  
  # indirect primaries replaced direct primaries in Nevada in 1915, but direct primaries re-passed in 1917
  
  primdat[primdat$year==1916 & primdat$state=="Nevada","cong_treated"] <- 0

# limit to relevant years

  primdat <- primdat[primdat$year>=1880 & primdat$year<=1930,]

# define the south

  primdat$south <- ifelse(is.element(primdat$state,
                                     c("Alabama","Arkansas","Florida","Georgia",
                                       "Louisiana","Mississippi","North Carolina",
                                       "South Carolina","Tennessee","Texas","Virginia")),1,0)
  
# limit the data to non-missing
  
  primdat <- primdat[!is.na(primdat$other_share),]

# now we make some matrix-form versions of the dataset, for certain estimators
  
# make a model matrix of the fixed effects (note we use the sum coding)

  contr.list <- list(contr.sum, contr.sum)
  names(contr.list) <- c("factor(state)","factor(year)")
  mod_mat <- model.matrix(~factor(state)+factor(year),data=primdat,contrasts.arg=contr.list)[,-1]
  
# X is all of the controls (here just fixed effects)
# XV is all of the control-moderator interactions (note we only interact the year fixed effects, since the state ones are collinear with south)
  
  X <- as.matrix(mod_mat)
  XV <- as.matrix(X[,grep("factor\\(year\\)",colnames(X))]*primdat$south)
  
# then we add these to treatment, moderator, and their interaction  
  
  big_x <- as.matrix(cbind(d = primdat$cong_treated, d_V = primdat$cong_treated*primdat$south, X = X, XV = XV))
  small_x <- cbind(d = primdat$cong_treated,V=primdat$south, X = X)

###################################################################################################
# estimation ######################################################################################
###################################################################################################  
  
# regression models ###############################################################################  
  
# single interaction model
  
  simple <- felm(other_share~south*(cong_treated)-south|state+year|0|state,data=primdat)
  
# fully moderated model  
  
  full   <- felm(other_share~south*(cong_treated+factor(year))-south-factor(year)|state+year|0|state,data=primdat, keepCX = TRUE, contrasts = list(`factor(year)` = contr.sum))
  
# post-lasso models ###############################################################################  
  
# post-single selection  
  
  post_single_lasso <- post_ds_interaction(data = primdat,
                                           treat = "cong_treated",
                                           moderator = "south",
                                           outcome = "other_share",
                                           control_vars = NULL,
                                           panel_vars = c("state","year"),
                                           moderator_marg = TRUE,
                                           cluster = "state",
                                           method="single selection")
  post_single_lasso_vcov <- post_single_lasso$vcv
  
# post-double selection
  
  post_lasso <- post_ds_interaction(data = primdat,
                                    treat = "cong_treated",
                                    moderator = "south",
                                    outcome = "other_share",
                                    control_vars = NULL,
                                    panel_vars = c("state","year"),
                                    moderator_marg = TRUE,
                                    cluster = "state")
  post_lasso_vcov <- post_lasso$vcv
  
# kernel regularized least squares ################################################################
  
  krls_out <- KRLS::krls(y = c(primdat$other_share), X = small_x,derivative = F)
  
  # get estimate of interaction
  
    small_x_pred <- rbind(cbind(d = 1,V=1,small_x[, -c(1,2)]),
                          cbind(d = 0,V=1,small_x[, -c(1,2)]),
                          cbind(d = 1,V=0,small_x[, -c(1,2)]),
                          cbind(d = 0,V=0,small_x[, -c(1,2)]))
    preds <- predict(krls_out, newdata = small_x_pred, se = TRUE)
    n_units <- nrow(small_x)
    ws <- c(1 / n_units, -1 / n_units, -1 / n_units, 1 / n_units)
    h <- matrix(rep(ws, each = n_units), ncol = 1)
    krls_int <- as.vector(t(h) %*% preds$fit)
    krls_intse <- as.vector(sqrt(t(h)%*%preds$vcov.fit%*%h))*sqrt(4)
    
  # get estimate of base term (effect of primary in non-south)
    
    small_x_pred2 <- rbind(cbind(d = 1,V=0,small_x[, -c(1,2)]),
                          cbind(d = 0,V=0,small_x[, -c(1,2)]))
    preds <- predict(krls_out, newdata = small_x_pred2, se = TRUE)
    n_units <- nrow(small_x)
    ws <- c(1 / n_units, -1 / n_units)
    h <- matrix(rep(ws, each = n_units), ncol = 1)
    krls_main <- as.vector(t(h) %*% preds$fit)
    krls_mainse <- as.vector(sqrt(t(h)%*%preds$vcov.fit%*%h))*sqrt(2)

# bayesian additive regression trees ##############################################################

  small_x_bart <- rbind(
    c(d = 1, V = 1, colMeans(X)),
    c(d = 0, V = 1, colMeans(X)),
    c(d = 1, V = 0, colMeans(X)),
    c(d = 0, V = 0, colMeans(X))
  )
  
  bart_out <- BART::wbart(small_x, primdat$other_share, ndpost = 80000, nskip=20000,
                          x.test = small_x_bart)

  bart_main <- (bart_out$yhat.test[, 3] - bart_out$yhat.test[, 4])
  bart_main_ci <- quantile(bart_main, probs = c(0.025, 0.975))
  bart_int <- (bart_out$yhat.test[, 1] - bart_out$yhat.test[, 2] -
                 bart_out$yhat.test[, 3] + bart_out$yhat.test[, 4] )
  bart_main <- mean(bart_main)
  bart_int_ci <- quantile(bart_int, probs = c(0.025, 0.975))

  bart_int <- mean(bart_int)
  
# adaptive lasso ##################################################################################
  
# get penalties using a ridge regression
  
  ridge_out <- glmnet::cv.glmnet(x = big_x, y = c(primdat$other_share), alpha = 0)
  ridge_coefs <- as.numeric(coef(ridge_out, s = "lambda.1se"))[-1]
  alasso_pens <- 1 / abs(ridge_coefs)

  alasso_pens[1:(ncol(X) + 2)] <- 0  # force the coefficients from the original model to say in
  
  # estimate the lasso
  
  alasso_out <- glmnet::cv.glmnet(x = big_x, y = c(primdat$other_share), penalty.factor = alasso_pens)
  alasso_coefs <- as.numeric(coef(alasso_out, s = "lambda.1se"))[-1]
  names(alasso_coefs) <- colnames(big_x)
  
  # now bootstrap
  
  alasso_boots <- matrix(NA, nrow = n_boots, ncol = 3)
  
  for (b in 1:n_boots) {
    if (b %% 100 == 0) cat(b, "...\n")    
    caseids <- sample(unique(primdat$state),length(unique(primdat$state)),replace=T)
    
    bit <- list()
    
    for(j in 1:length(unique(primdat$state))){
      
      bit[[j]] <- primdat[primdat$state==caseids[j],]
      
    }
    
    bootdat <- do.call(rbind,bit)
    
    mod_mat_boot <- model.matrix(~factor(state)+factor(year),data=bootdat,contrasts.arg=contr.list)[,-1]
    
    X_boot <- as.matrix(mod_mat_boot)
    XV_boot <- as.matrix(X_boot[,grep("factor\\(year\\)",colnames(X_boot))]*bootdat$south)
    
    big_x_boot <- as.matrix(cbind(d = bootdat$cong_treated, d_V = bootdat$cong_treated*bootdat$south, X = X_boot, XV = XV_boot))
    
    ridge_star <- glmnet::glmnet(x = big_x_boot, y = c(bootdat$other_share), alpha = 0, lambda = ridge_out$lambda.1se)
    ridge_coefs_star <- as.numeric(coef(ridge_star))[-1]
    alasso_pens <- 1 / abs(ridge_coefs_star)
    alasso_pens[1:(ncol(X_boot) + 2)] <- 0
    alasso_star <- glmnet::glmnet(x = big_x_boot, y = c(bootdat$other_share), penalty.factor = alasso_pens,
                                  lambda = alasso_out$lambda.1se)
    alasso_coefs_star <- as.numeric(coef(alasso_star))[-1]
    names(alasso_coefs_star) <- colnames(big_x_boot)
    alasso_boots[b, ] <- c(alasso_coefs_star[c("d", "d_V")],sum(alasso_coefs_star[c("d", "d_V")]))
    
  }
  

###################################################################################################
# make figures ####################################################################################
###################################################################################################  

# create a dataframe with relevant coefficients
  
  # start with the estimators that produce standard errors

  coefficients <- as.data.frame(rbind(summary(simple)$coefficients[,1:2],
                                      summary(full)$coefficients[grep("year",rownames(summary(full)$coefficients),invert=TRUE),][,1:2],
                                      coeftest(post_lasso,post_lasso_vcov)[c("cong_treated","cong_treated_south"),1:2],
                                      coeftest(post_single_lasso,post_single_lasso_vcov)[c("cong_treated","cong_treated_south"),1:2],
                                      c(krls_main,krls_mainse),
                                      c(krls_int,krls_intse)
                                      ))
  
  # make 95% confidence intervals
  
  coefficients$lower_ci <- coefficients[,1]-1.96*coefficients[,2]
  coefficients$upper_ci <- coefficients[,1]+1.96*coefficients[,2]
  
  coefficients$`Cluster s.e.` <- NULL
  
  # now add BART and adaptive lasso, which just had CI to begin with
  
  alasso_chunk <- as.data.frame(cbind(c(alasso_coefs[1:2],sum(alasso_coefs[1:2])),t(apply(alasso_boots,2,function(x) quantile(x, probs = c(0.025, 0.975))))))
  colnames(alasso_chunk) <- c("Estimate","lower_ci","upper_ci"); rownames(alasso_chunk) <- NULL
  
  coefficients <- rbind(coefficients,
                        c(bart_main,bart_main_ci),
                        c(bart_int,bart_int_ci),
                        alasso_chunk
                        )
                        
  # drop rownames, add information on estimators and QOI
  
  rownames(coefficients) <- NULL
  
  coefficients$Method <- c("Single Interaction","Single Interaction",
                              "Fully Moderated","Fully Moderated",
                              "Post-Double Selection","Post-Double Selection",
                              "Post-Lasso","Post-Lasso",
                              "KRLS","KRLS","BART","BART","ALasso","ALasso","ALasso")
  
  coefficients$QOI       <- c(rep(c("Marginal Effect, North","Interaction"),7),"Marginal Effect, South")
  
  # now we make and add the marginal effect in the South
  
  margeff_south_simple <- sum(coefficients$Estimate[coefficients$Method=="Single Interaction"])
  margeff_south_full   <- sum(coefficients$Estimate[coefficients$Method=="Fully Moderated"])
  margeff_south_pl   <- sum(coefficients$Estimate[coefficients$Method=="Post-Double Selection"])
     
  margeff_south_simple_se <- sqrt(simple$clustervcv["cong_treated","cong_treated"]+
                             simple$clustervcv["south:cong_treated","south:cong_treated"]+
                             2*simple$clustervcv["south:cong_treated","cong_treated"])

  margeff_south_full_se <- sqrt(full$clustervcv["cong_treated","cong_treated"]+
                                  full$clustervcv["south:cong_treated","south:cong_treated"]+
                                    2*full$clustervcv["south:cong_treated","cong_treated"])

  margeff_south_pl_se <- sqrt(post_lasso_vcov["cong_treated","cong_treated"]+
                                post_lasso_vcov["cong_treated_south","cong_treated_south"]+
                                2*post_lasso_vcov["cong_treated_south","cong_treated"])

  margeff_simple_row <- c(margeff_south_simple,margeff_south_simple-1.96*margeff_south_simple_se,margeff_south_simple+1.96*margeff_south_simple_se)
  margeff_full_row <- c(margeff_south_full,margeff_south_full-1.96*margeff_south_full_se,margeff_south_full+1.96*margeff_south_full_se)
  margeff_pl_row <- c(margeff_south_pl,margeff_south_pl-1.96*margeff_south_pl_se,margeff_south_pl+1.96*margeff_south_pl_se)
  
  margeff_south_coefs <- as.data.frame(rbind(margeff_simple_row,margeff_full_row,margeff_pl_row))
  colnames(margeff_south_coefs) <- c("Estimate","lower_ci","upper_ci"); rownames(margeff_south_coefs) <- NULL
  margeff_south_coefs$QOI <- "Marginal Effect, South"; margeff_south_coefs$Method <- c("Single Interaction","Fully Moderated","Post-Double Selection")
  
  coefficients <- rbind(coefficients,margeff_south_coefs)

  coefficients$Method <- factor(coefficients$Method,levels=c("Single Interaction","Fully Moderated","Post-Lasso","Post-Double Selection","ALasso","BART","KRLS"))
  coefficients$QOI <- factor(coefficients$QOI,levels=c("Marginal Effect, North","Marginal Effect, South","Interaction"))
  
# set the theme of the figure
  
  theme_set(theme_minimal() +
              theme(panel.border = element_rect(fill = NA)))
  theme_cousteau <- theme_light() +
    theme(plot.background = element_rect(fill = skyblue, color = skyblue),
          legend.background = element_rect(fill = skyblue))
  
  hues <- seq(15, 375, length = 7 + 1)
  cols <-  hcl(h = hues, l = 65, c = 100)[c(1, 2, 2, 3, 4, 5, 6, 7)]
  names(cols) <- c("Post-Double Selection", "Post-Lasso", "ALasso", "Oracle", "Fully Moderated",
                   "KRLS", "BART", "Single Interaction")
  shapes <- c(15, 18, 18, 3, 17, 7, 19, 9)
  names(shapes) <- c("Post-Double Selection", "Post-Lasso", "ALasso", "Oracle", "Fully Moderated",
                     "KRLS", "BART", "Single Interaction")
  
# make figures
  
  cairo_pdf("figures/primary_plot.pdf", width = 8, height = 4, family = "Verdana")
  print(
  ggplot(coefficients[is.element(coefficients$Method,c("Single Interaction",
                                                       "Fully Moderated",
                                                       "Post-Double Selection",
                                                       "ALasso")),], aes(x = QOI, y = Estimate, group = Method,
                           colour = Method, pch = Method)) +
    geom_hline(yintercept = 0, linetype = 2, size = 1, colour = "black") +
    geom_point(size = 3, position = position_dodge(width = 0.5)) +
    geom_errorbar(size = 0.5, aes(ymin = lower_ci,
                                   ymax = upper_ci),
                  position = position_dodge(width = 0.5), width = 0) +
    xlab("Quantity of Interest") +
    labs(color = "Method", shape = "Method") +
    scale_color_manual(values = cols[c("Single Interaction","Fully Moderated","Post-Double Selection","ALasso")]) +
    scale_shape_manual(values = shapes[c("Single Interaction","Fully Moderated","Post-Double Selection","ALasso")]) +
    theme(axis.title.y = element_text(angle = 0, vjust = 0.5, hjust = 0.5)) 
  )
  dev.off()
  
  
  
  cairo_pdf("figures/primary_plot_additional.pdf", width = 8, height = 4, family = "Verdana")
  print(
    ggplot(coefficients[is.element(coefficients$Method,c("Post-Lasso",
                                                         "BART",
                                                         "KRLS")) & coefficients$QOI!="Marginal Effect, South",], 
           aes(x = QOI, y = Estimate, group = Method,
                                                                           colour = Method, pch = Method)) +
      geom_hline(yintercept = 0, linetype = 2, size = 1, colour = "black") +
      geom_point(size = 3, position = position_dodge(width = 0.5)) +
      geom_errorbar(size = 0.5, aes(ymin = lower_ci,
                                    ymax = upper_ci),
                    position = position_dodge(width = 0.5), width = 0) +
      xlab("Quantity of Interest") +
      labs(color = "Method", shape = "Method") +
      scale_color_manual(values = cols[c("Post-Lasso","BART","KRLS")]) +
      scale_shape_manual(values = shapes[c("Post-Lasso","BART","KRLS")]) +
      theme(axis.title.y = element_text(angle = 0, vjust = 0.5, hjust = 0.5)) 
  )
  dev.off()
 
